precision recall f1-score support
0 0.89 0.97 0.93 113138
1 0.83 0.53 0.65 28262
accuracy 0.88 141400
macro avg 0.86 0.75 0.79 141400 weighted avg 0.88 0.88 0.87 141400
Profile Report is available in same folder as HTML file.
SimpleImputer was used because of time constraints; we can improve performance and data quality by using advanced imputation algorithms. Other ideas: try different categorical encoding algorithms to achieve better performance and accuracy, do more hyperparameter tuning, and use ensemble models (bagging, boosting and stacking).
!pip install -q klib
!pip install -q pandas-profiling
!pip install -q --pre pycaret
!pip install -q category_encoders
!pip install -q catboost
!pip install -q pandas-profiling
!pip install -q shap
import numpy as np
import pandas as pd
import category_encoders as ce
#pd.set_option('max_columns', None)
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer
import klib
import shap
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, f1_score
from pandas.api.types import is_numeric_dtype
import catboost as cb
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool
# Load the raw claims CSV, drop unnecessary columns, and rename the long
# insurance columns to short aliases used throughout the notebook.
# place_of_service_code / is_denial are read as pandas 'category' to save memory.
data = pd.read_csv("C:/Users/gmodi/Downloads/archive/claimData.csv",index_col=None, dtype={'place_of_service_code': 'category', 'is_denial': 'category'})
# Drop the CSV's index column plus identifiers and fields not used for modelling
# (modifier codes, payer/state metadata, and is_approved which mirrors the target).
data = data.drop(['Unnamed: 0','firm_id','service_year','cpt_code_description','state','payer_name','is_approved','modifier_code_1','modifier_code_2','modifier_code_3','modifier_code_4'], axis=1)
# pipt = primary_insurance_policy_type, pipr = ..._relationship, psc = place_of_service_code
data = data.rename(columns = {'primary_insurance_policy_type':'pipt','primary_insurance_policy_relationship':'pipr','place_of_service_code':'psc'})
# klib.data_cleaning drops duplicate rows / single-valued columns and shrinks dtypes.
cdata = klib.data_cleaning(data)
cdata.reset_index(drop=True, inplace=True)
# Exploratory profiling report rendered inline in the notebook.
prof = ProfileReport(cdata)
#prof.to_widgets()
prof.to_notebook_iframe()
#prof.to_file(output_file='ClaimDataProfile.html')
Shape of cleaned data: (999994, 19) - Remaining NAs: 1799982
Dropped rows: 10006
of which 10006 duplicates. (Rows (first 150 shown): [343282, 361243, 376695, 414927, 665220, 796582, 894425, 980241, 1000000, 1000001, 1000002, 1000003, 1000004, 1000005, 1000006, 1000007, 1000008, 1000009, 1000010, 1000011, 1000012, 1000013, 1000014, 1000015, 1000016, 1000017, 1000018, 1000019, 1000020, 1000021, 1000022, 1000023, 1000024, 1000025, 1000026, 1000027, 1000028, 1000029, 1000030, 1000031, 1000032, 1000033, 1000034, 1000035, 1000036, 1000037, 1000038, 1000039, 1000040, 1000041, 1000042, 1000043, 1000044, 1000045, 1000046, 1000047, 1000048, 1000049, 1000050, 1000051, 1000052, 1000053, 1000054, 1000055, 1000056, 1000057, 1000058, 1000059, 1000060, 1000061, 1000062, 1000063, 1000064, 1000065, 1000066, 1000067, 1000068, 1000069, 1000070, 1000071, 1000072, 1000073, 1000074, 1000075, 1000076, 1000077, 1000078, 1000079, 1000080, 1000081, 1000082, 1000083, 1000084, 1000085, 1000086, 1000087, 1000088, 1000089, 1000090, 1000091, 1000092, 1000093, 1000094, 1000095, 1000096, 1000097, 1000098, 1000099, 1000100, 1000101, 1000102, 1000103, 1000104, 1000105, 1000106, 1000107, 1000108, 1000109, 1000110, 1000111, 1000112, 1000113, 1000114, 1000115, 1000116, 1000117, 1000118, 1000119, 1000120, 1000121, 1000122, 1000123, 1000124, 1000125, 1000126, 1000127, 1000128, 1000129, 1000130, 1000131, 1000132, 1000133, 1000134, 1000135, 1000136, 1000137, 1000138, 1000139, 1000140, 1000141])
Dropped columns: 0
of which 0 single valued. Columns: []
Dropped missing values: 18149
Reduced memory by at least: 91.63 MB (-68.94%)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# --- Class-conditional imputation -------------------------------------------
# Missing values are imputed separately within each is_denial class so the
# most-frequent value of one class cannot leak into the other.
# NOTE: the original also called data0.fillna(data0.mean()) /
# data1.fillna(data1.mean()) without assigning the result -- pure no-ops
# (and .mean() raises on the categorical columns in modern pandas), so those
# calls were removed; SimpleImputer alone handles the NaNs.

def _impute_most_frequent(frame, columns):
    """Return *frame* with every NaN replaced by the column-wise mode.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows belonging to a single is_denial class.
    columns : pd.Index
        Column labels to restore on the result (SimpleImputer returns a
        bare ndarray and loses them).
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imputed = pd.DataFrame(imputer.fit_transform(frame))
    imputed.columns = columns
    return imputed

# Split by target class, impute each half, then recombine.
data0 = data.query('is_denial == "0" ')
data1 = data.query('is_denial == "1" ')
data_with_imputed_values0 = _impute_most_frequent(data0, data.columns)
data_with_imputed_values1 = _impute_most_frequent(data1, data.columns)
data_clean = pd.concat([data_with_imputed_values0, data_with_imputed_values1], ignore_index=True)
data_clean.shape
# SimpleImputer returns object dtype; restore the integer-valued columns.
data_clean[['unitCharge', 'units','age','diagnosis_count','modifier_count','position']] = data_clean[['unitCharge', 'units','age','diagnosis_count','modifier_count','position']].round(0).astype(int)
# BUG FIX: the original ran klib.convert_datatypes(data_clean), silently
# discarding the deduplicated result of klib.data_cleaning (the info dump
# still showed 1,010,000 rows) -- chain the two calls so cleaning sticks.
df = klib.data_cleaning(data_clean)
df = klib.convert_datatypes(df)
df.info()
Shape of cleaned data: (999813, 19) - Remaining NAs: 0
Dropped rows: 10187
of which 10187 duplicates. (Rows (first 150 shown): [80657, 93221, 119234, 132589, 141117, 152806, 160907, 164729, 176007, 184449, 195723, 201007, 201207, 209185, 258406, 271159, 274791, 277704, 279821, 288046, 296158, 304205, 305838, 307044, 323640, 334052, 337461, 339604, 341931, 343282, 357928, 361243, 362246, 363735, 367167, 376695, 380980, 381632, 392711, 400286, 403244, 407437, 414927, 415817, 420069, 423662, 424751, 437077, 438744, 446471, 447446, 451796, 453694, 455455, 461223, 464207, 465613, 470636, 472692, 478656, 480523, 485217, 485762, 487705, 488653, 488994, 491804, 493204, 494049, 501283, 507033, 513342, 515455, 515666, 525207, 526124, 526913, 532319, 532401, 541645, 541896, 542313, 551081, 565150, 565370, 576445, 577639, 579258, 580425, 581140, 585453, 595919, 598381, 600213, 606480, 608634, 609614, 610614, 619304, 621791, 623848, 630565, 630949, 631825, 636468, 638930, 639026, 643788, 644181, 644878, 648526, 649417, 653418, 653880, 658089, 658755, 659994, 661164, 664926, 665220, 666300, 667880, 678904, 690341, 693780, 695679, 698635, 705488, 706877, 709115, 716227, 717092, 721661, 722778, 725768, 727703, 729311, 732255, 739971, 742116, 742135, 747787, 748182, 749854, 752764, 753223, 754250, 758935, 759714, 760210])
Dropped columns: 0
of which 0 single valued. Columns: []
Dropped missing values: 0
Reduced memory by at least: 82.01 MB (-66.52%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010000 entries, 0 to 1009999
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 is_denial 1010000 non-null category
1 unitCharge 1010000 non-null int32
2 units 1010000 non-null int32
3 age 1010000 non-null int32
4 diagnosis_count 1010000 non-null int32
5 modifier_count 1010000 non-null int32
6 position 1010000 non-null int32
7 using_rcm 1010000 non-null category
8 cpt_code 1010000 non-null category
9 sex 1010000 non-null category
10 itemType 1010000 non-null category
11 diagnosis_code_1 1010000 non-null category
12 diagnosis_code_2 1010000 non-null category
13 diagnosis_code_3 1010000 non-null category
14 diagnosis_code_4 1010000 non-null category
15 payer_code 1010000 non-null category
16 pipt 1010000 non-null category
17 pipr 1010000 non-null category
18 psc 1010000 non-null category
dtypes: category(13), int32(6)
memory usage: 41.7 MB
# BUG FIX: the original stratified on data["is_denial"], but the frame being
# split is df, whose rows were re-ordered (class 0 rows then class 1 rows via
# concat) -- the labels no longer lined up positionally. Stratify on the same
# frame's own target column.
mdf, data_test = train_test_split(df, stratify=df["is_denial"], test_size=0.30)
# Feature subset used for modelling (units, modifier_count, using_rcm,
# diagnosis_code_4 and psc deliberately left out) plus the target.
X = mdf[['unitCharge', 'age', 'diagnosis_count', 'position', 'cpt_code', 'sex','itemType', 'diagnosis_code_1',
         'diagnosis_code_2', 'diagnosis_code_3', 'payer_code', 'pipt', 'pipr']]
y = mdf['is_denial']
def get_categorical_indicies(X):
    """Return the positional indices of X's non-numeric (categorical) columns."""
    non_numeric = [name for name in X.columns if not is_numeric_dtype(X[name])]
    return [X.columns.get_loc(name) for name in non_numeric]
# Column positions of the categorical features, passed later to cb.Pool.
categorical_indicies = get_categorical_indicies(X)
def convert_cats(X):
    """Cast every non-numeric column of X to the pandas 'category' dtype.

    Mutates X in place and returns None. (The original also built an
    unused ``cat_indicies`` list -- removed as dead code.)
    """
    for col in X.columns:
        if not is_numeric_dtype(X[col]):
            X[col] = X[col].astype('category')
# Cast categorical feature columns in place so CatBoost treats them natively.
convert_cats(X)
# 80/20 train/test split, stratified on the target to keep class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, stratify=y)
# CatBoost Pools carry the data plus the categorical-column positions.
train_dataset = cb.Pool(X_train,y_train, cat_features=categorical_indicies)
test_dataset = cb.Pool(X_test,y_test, cat_features=categorical_indicies)
model = cb.CatBoostClassifier(loss_function='Logloss', eval_metric='Accuracy', verbose = False)
# 2 x 3 x 3 x 3 = 54-point grid; grid_search refits the model on the best combo.
grid = {'learning_rate': [0.03, 0.1],'depth': [4, 6, 10],'l2_leaf_reg': [1, 3, 5,],'iterations': [50, 100, 150]}
model.grid_search(grid,train_dataset, verbose = False)
# Hold-out evaluation: per-class precision / recall / F1.
pred = model.predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support
0 0.89 0.97 0.93 113138
1 0.83 0.53 0.65 28262
accuracy 0.88 141400
macro avg 0.86 0.75 0.79 141400
weighted avg 0.88 0.88 0.87 141400
val_data = (X_test, y_test)
def validate(model, val_data):
    """Print accuracy and ROC AUC of *model* on val_data = (X, y_true).

    BUG FIX: the original called the metrics as metric(pred, truth), but
    scikit-learn's signature is metric(y_true, y_pred). accuracy_score is
    symmetric so it happened to be right; roc_auc_score is not, so its
    value was wrong. AUC is now also computed from the positive-class
    probability (its proper input) rather than hard 0/1 labels.
    """
    X_val, y_true = val_data
    y_pred = model.predict(X_val)
    print('Accuracy =', accuracy_score(y_true, y_pred))
    # predict_proba column 1 = probability of the positive (denial) class.
    print('ROC AUC =', roc_auc_score(y_true, model.predict_proba(X_val)[:, 1]))
    #print('F1 =', f1_score(y_true, y_pred))
validate(model, val_data)
Accuracy = 0.8849151343705799 ROC AUC = 0.862625709100155
## saving Model on Disk
# Persist the trained CatBoost model in its native binary format.
model.save_model("model")
# Round-trip check: reload the saved file into a fresh classifier instance.
from_file = CatBoostClassifier()
from_file.load_model("model")
<catboost.core.CatBoostClassifier at 0x20b2421b730>
def plot_feature_importance(importance,names,model_type):
    """Horizontal bar chart of a model's feature importances, largest first.

    importance -- per-feature importance scores
    names      -- matching feature names
    model_type -- label prefixed to the chart title
    """
    # Pair names with scores and order by decreasing importance.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })
    fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True)
    # Render the seaborn bar chart with axis labels.
    plt.figure(figsize=(10, 8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
plot_feature_importance(model.get_feature_importance(),X_train.columns,'CATBOOST')
# SHAP explainability: TreeExplainer works directly on the CatBoost model.
shap.initjs()
explainer = shap.TreeExplainer(model)
# Per-row, per-feature attribution values for the whole feature frame X.
shap_values = explainer.shap_values(X)
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])
# visualize the training set predictions
shap.force_plot(explainer.expected_value, shap_values[0:100,:], X.iloc[0:100,:])
# Per-bin statistics / dependence-style plot for the unitCharge feature.
model.calc_feature_statistics(test_dataset,feature='unitCharge',plot=True,prediction_type='Class')
{'borders': array([5.000e-01, 1.500e+00, 2.500e+00, 3.500e+00, 5.500e+00, 6.500e+00,
1.050e+01, 1.250e+01, 1.350e+01, 1.550e+01, 1.650e+01, 1.750e+01,
1.850e+01, 1.950e+01, 2.050e+01, 2.150e+01, 2.250e+01, 2.350e+01,
2.450e+01, 2.750e+01, 3.150e+01, 3.550e+01, 3.950e+01, 4.250e+01,
4.550e+01, 4.850e+01, 5.050e+01, 5.550e+01, 6.150e+01, 6.250e+01,
6.750e+01, 7.050e+01, 7.350e+01, 7.850e+01, 7.950e+01, 1.045e+02,
1.115e+02, 1.225e+02, 1.285e+02, 1.305e+02, 1.315e+02, 1.325e+02,
1.345e+02, 1.455e+02, 1.475e+02, 1.525e+02, 1.795e+02, 1.935e+02,
2.235e+02, 2.255e+02, 2.275e+02, 2.295e+02, 2.315e+02, 2.325e+02,
2.345e+02, 2.415e+02, 2.475e+02, 2.505e+02, 2.575e+02, 2.655e+02,
2.705e+02, 2.755e+02, 2.815e+02, 2.875e+02, 3.005e+02, 3.065e+02,
3.135e+02, 3.215e+02, 3.225e+02, 3.285e+02, 3.335e+02, 3.445e+02,
3.645e+02, 4.895e+02, 5.005e+02, 5.135e+02], dtype=float32),
'binarized_feature': array([ 3, 27, 68, ..., 37, 36, 35]),
'mean_target': array([3.8567689e-01, 3.4545454e-01, 3.6352658e-01, 3.5952637e-01,
3.2734737e-01, 3.3673468e-01, 3.2923776e-01, 3.4208104e-01,
3.4645671e-01, 3.3493334e-01, 3.6222222e-01, 3.4244791e-01,
3.2463768e-01, 3.3619210e-01, 3.7168142e-01, 3.4368530e-01,
3.3181819e-01, 3.6705202e-01, 3.0035335e-01, 3.6407766e-01,
2.8100777e-01, 2.2924902e-01, 2.0214669e-01, 2.4034335e-01,
2.2144112e-01, 2.4691358e-01, 2.0177384e-01, 2.0628525e-01,
1.9925047e-01, 1.6614421e-01, 1.9762845e-01, 2.1779425e-01,
2.0807175e-01, 1.9407721e-01, 1.9667590e-01, 1.8002166e-01,
1.7578579e-01, 1.7704581e-01, 1.7284326e-01, 1.5761821e-01,
1.8681319e-01, 1.3843352e-01, 1.5288889e-01, 1.5597504e-01,
1.4821592e-01, 1.6654412e-01, 1.5493155e-01, 1.3871413e-01,
1.3876736e-01, 1.1444653e-01, 1.1238095e-01, 1.2765957e-01,
1.2605043e-01, 9.3251330e-01, 1.3605443e-01, 1.5655431e-01,
1.4908722e-01, 1.3439636e-01, 1.2258797e-01, 1.5609756e-01,
1.6748768e-01, 1.6748768e-01, 1.6702820e-01, 1.3744076e-01,
1.3981043e-01, 1.5094340e-01, 1.5962441e-01, 1.7409766e-01,
6.1771972e-04, 1.4705883e-01, 1.8152866e-01, 1.8101934e-01,
1.7230769e-01, 1.5666594e-01, 2.0578778e-01, 1.4487633e-01,
1.5729940e-01], dtype=float32),
'mean_weighted_target': array([], dtype=float32),
'mean_prediction': array([3.52367371e-01, 2.80519485e-01, 2.80193239e-01, 2.85252959e-01,
2.35505387e-01, 2.50463814e-01, 2.27758810e-01, 2.22375691e-01,
2.30314955e-01, 2.14933336e-01, 2.17777774e-01, 2.09635422e-01,
2.28985503e-01, 2.14408234e-01, 2.28318587e-01, 2.17391297e-01,
2.34090909e-01, 2.08092481e-01, 2.04946995e-01, 2.05501616e-01,
1.76356584e-01, 1.50197625e-01, 1.03756711e-01, 1.41630903e-01,
1.23022847e-01, 1.35802463e-01, 9.53436792e-02, 1.13618053e-01,
1.13678947e-01, 7.83699080e-02, 1.06719367e-01, 1.20481931e-01,
1.27354264e-01, 1.11052357e-01, 8.86426568e-02, 9.89893526e-02,
9.02211890e-02, 8.49202275e-02, 8.86998773e-02, 7.09281936e-02,
8.60805884e-02, 7.10382536e-02, 8.71111080e-02, 7.72676393e-02,
7.31930435e-02, 8.49264711e-02, 7.87231103e-02, 6.88967332e-02,
6.38196692e-02, 6.75422102e-02, 6.47619069e-02, 6.38297871e-02,
7.35294148e-02, 9.68750000e-01, 8.61677974e-02, 7.56554306e-02,
7.80933052e-02, 6.15034178e-02, 7.94551671e-02, 8.65853652e-02,
6.65024593e-02, 9.85221639e-02, 8.89370963e-02, 9.24170613e-02,
8.76777247e-02, 7.00808614e-02, 6.57276958e-02, 7.85562620e-02,
1.76491347e-04, 5.00000007e-02, 9.55414027e-02, 1.35325134e-01,
1.29230767e-01, 9.99345407e-02, 1.15755625e-01, 9.89399329e-02,
7.21102878e-02], dtype=float32),
'objects_per_bin': array([ 4203, 770, 828, 929, 1949, 1078, 4395, 2172, 1016,
1875, 900, 768, 690, 583, 565, 483, 440, 346,
283, 618, 516, 506, 559, 466, 569, 648, 451,
1241, 1601, 319, 1518, 1079, 1115, 1891, 361, 11082,
3436, 5829, 3292, 1142, 546, 549, 1125, 6251, 1093,
2720, 14316, 6517, 10514, 533, 525, 470, 476, 3008,
441, 1335, 986, 439, 881, 820, 406, 406, 461,
422, 844, 371, 426, 471, 11332, 340, 314, 569,
975, 4583, 311, 283, 2829], dtype=uint32),
'predictions_on_varying_feature': array([0.16608204, 0.16619519, 0.16707214, 0.16756719, 0.16746818,
0.16649222, 0.16551627, 0.16500707, 0.16444837, 0.16455446,
0.16331683, 0.1618529 , 0.16084866, 0.15804102, 0.15748939,
0.15538897, 0.15149929, 0.14841584, 0.14376238, 0.13086987,
0.12033239, 0.1102546 , 0.10997171, 0.10454738, 0.09990099,
0.10115983, 0.09673267, 0.09828854, 0.09929279, 0.09988685,
0.10253182, 0.10346535, 0.10553041, 0.10565064, 0.1067256 ,
0.10736209, 0.10749646, 0.10867044, 0.10953324, 0.11013437,
0.11064356, 0.11088402, 0.11128006, 0.11268034, 0.11275106,
0.11304102, 0.11406648, 0.11442008, 0.11650636, 0.11541018,
0.11376945, 0.11722065, 0.14945545, 0.74287836, 0.17777228,
0.13178925, 0.12639321, 0.12550212, 0.12274399, 0.12006365,
0.10789958, 0.10938472, 0.11164781, 0.10637199, 0.10379774,
0.08731966, 0.08466054, 0.07736209, 0.01657001, 0.07118812,
0.09369873, 0.1008133 , 0.10648515, 0.11404526, 0.11485856,
0.12456153, 0.12727723])}
# Per-bin statistics / dependence-style plot for diagnosis_count.
model.calc_feature_statistics(test_dataset,feature='diagnosis_count',plot=True,prediction_type='Class')
{'borders': array([1.5, 2.5, 3.5], dtype=float32),
'binarized_feature': array([0, 1, 0, ..., 0, 0, 0]),
'mean_target': array([0.24703453, 0.07103057, 0.15930331, 0.1250615 ], dtype=float32),
'mean_weighted_target': array([], dtype=float32),
'mean_prediction': array([0.16390255, 0.03343166, 0.08788233, 0.05500345], dtype=float32),
'objects_per_bin': array([91048, 21357, 18832, 10163], dtype=uint32),
'predictions_on_varying_feature': array([0.36389675, 0.05770156, 0.10427157, 0.10314003])}
# Per-bin statistics / dependence-style plot for age.
model.calc_feature_statistics(test_dataset,feature='age',plot=True,prediction_type='Class')
{'borders': array([ 7.5, 11.5, 12.5, 13.5, 14.5, 25.5, 26.5, 28.5, 29.5, 30.5, 31.5,
32.5, 33.5, 34.5, 40.5, 44.5, 45.5, 48.5, 50.5, 51.5, 52.5, 53.5,
54.5, 55.5, 56.5, 57.5, 58.5, 59.5, 60.5, 61.5, 65.5, 66.5, 67.5,
69.5, 70.5, 71.5, 72.5, 73.5, 74.5, 75.5, 79.5, 80.5, 84.5],
dtype=float32),
'binarized_feature': array([32, 31, 13, ..., 34, 39, 5]),
'mean_target': array([0.19972453, 0.22526766, 0.25346112, 0.2527985 , 0.29059118,
0.29638708, 0.27567568, 0.25456053, 0.23381294, 0.19237435,
0.17857143, 0.16756757, 0.18506494, 0.16923077, 0.18145286,
0.19768882, 0.23211679, 0.21849202, 0.22191273, 0.23754789,
0.20069204, 0.17846154, 0.17632242, 0.79967993, 0.15731293,
0.18895966, 0.02527103, 0.18353067, 0.1946437 , 0.18022068,
0.17636485, 0.17791411, 0.1875 , 0.19273719, 0.19669802,
0.19100669, 0.1920112 , 0.19385211, 0.17813109, 0.16597977,
0.15855552, 0.16182573, 0.16484576, 0.20268872], dtype=float32),
'mean_weighted_target': array([], dtype=float32),
'mean_prediction': array([0.1446281 , 0.16788009, 0.18849841, 0.19682837, 0.2131557 ,
0.22563049, 0.2 , 0.18325041, 0.17086332, 0.14211439,
0.12678571, 0.1045045 , 0.13311689, 0.10940171, 0.12179289,
0.1328931 , 0.15036497, 0.16235553, 0.14763232, 0.14942528,
0.16262975, 0.12923077, 0.13224182, 0.85009336, 0.11564626,
0.09200283, 0.00778745, 0.09903122, 0.12003826, 0.11524315,
0.09920107, 0.09473114, 0.09567039, 0.09438332, 0.09391088,
0.09276889, 0.09039944, 0.08778732, 0.08273848, 0.07830648,
0.0676935 , 0.0680498 , 0.07712083, 0.08790072], dtype=float32),
'objects_per_bin': array([ 726, 2335, 939, 1072, 1201, 12649, 740, 1206, 556,
577, 560, 555, 616, 585, 3235, 2423, 685, 1817,
1077, 522, 578, 650, 794, 3749, 1176, 1413, 13098,
1858, 2091, 2447, 15020, 5542, 5728, 12089, 5633, 4937,
4281, 3611, 3082, 2669, 7726, 1205, 3112, 4835],
dtype=uint32),
'predictions_on_varying_feature': array([0.13262376, 0.13304102, 0.13471004, 0.13499293, 0.1357355 ,
0.13630835, 0.13359264, 0.13231966, 0.12258133, 0.11661952,
0.11427864, 0.11028996, 0.11003536, 0.10891796, 0.10874116,
0.10859972, 0.10920792, 0.10951909, 0.1109901 , 0.11067185,
0.12008487, 0.1234017 , 0.14150636, 0.59635078, 0.14381188,
0.09512023, 0.05547383, 0.1020297 , 0.10851485, 0.11245403,
0.11117397, 0.11243281, 0.1137058 , 0.11405941, 0.11417256,
0.11828147, 0.11978076, 0.12279349, 0.12528996, 0.13188826,
0.13062235, 0.13212164, 0.13483734, 0.13494342])}
# Per-bin statistics / dependence-style plot for position.
model.calc_feature_statistics(test_dataset,feature='position',plot=True,prediction_type='Class')
{'borders': array([1.5, 2.5, 3.5, 4.5], dtype=float32),
'binarized_feature': array([0, 1, 1, ..., 3, 0, 2]),
'mean_target': array([0.19196048, 0.17050458, 0.23938844, 0.29234442, 0.29092032],
dtype=float32),
'mean_weighted_target': array([], dtype=float32),
'mean_prediction': array([0.11479974, 0.10911242, 0.15504499, 0.22116102, 0.18035825],
dtype=float32),
'objects_per_bin': array([62143, 45641, 23677, 6701, 3238], dtype=uint32),
'predictions_on_varying_feature': array([0.11497878, 0.12811174, 0.17513437, 0.19044554, 0.180686 ])}
# Spot-check two individual rows: positive-class probability plus the raw
# (pre-sigmoid) formula value for each.
test_objects = [X.iloc[0:1], X.iloc[91:92]]
for obj in test_objects:
    print('Probability of class 1 = {:.4f}'.format(model.predict_proba(obj)[0][1]))
    print('Formula raw prediction = {:.4f}'.format(model.predict(obj, prediction_type='RawFormulaVal')[0]))
    print('\n')
Probability of class 1 = 0.9970 Formula raw prediction = 5.7931 Probability of class 1 = 0.6908 Formula raw prediction = 0.8037
# Distribution of diagnosis_count among non-denied claims.
mdf.query('is_denial == "0" ').diagnosis_count.value_counts()
1 343440 2 98836 3 78851 4 44563 Name: diagnosis_count, dtype: int64
# Distribution of diagnosis_count among denied claims.
mdf.query('is_denial == "1" ').diagnosis_count.value_counts()
1 112492 3 14615 2 7790 4 6413 Name: diagnosis_count, dtype: int64
# Global SHAP summary: per-feature impact distribution across all rows of X.
shap.summary_plot(shap_values, X)
# 1. Library imports
import pandas as pd
# BUG FIX: this serves a CatBoost *classifier* (the prediction is an int
# class label read from the 'Label' column), so the pipeline helpers must
# come from pycaret.classification -- pycaret.regression would score the
# saved pipeline with regression logic.
from pycaret.classification import load_model, predict_model
from fastapi import FastAPI
import uvicorn

# 2. Create the app object
app = FastAPI()

# 3. Load the trained pipeline once at startup.
model = load_model('Final_catboost_Model_08Feb2023')

# Column order the pipeline was trained on; shared by the request signature
# and the scoring frame below.
_COLUMNS = ['unitCharge','units','age','diagnosis_count','modifier_count','position','using_rcm','cpt_code','sex','itemType','modifier_code_1','modifier_code_2','modifier_code_3','modifier_code_4','diagnosis_code_1','diagnosis_code_2','diagnosis_code_3','diagnosis_code_4','payer_code','pipt','pipr','psc']

# 4. Define predict function
@app.post('/predict')
def predict(unitCharge,units,age,diagnosis_count,modifier_count,position,using_rcm,cpt_code,sex,itemType,modifier_code_1,modifier_code_2,modifier_code_3,modifier_code_4,diagnosis_code_1,diagnosis_code_2,diagnosis_code_3,diagnosis_code_4,payer_code,pipt,pipr,psc):
    """Score one claim and return {'prediction': 0|1}."""
    data = pd.DataFrame([[unitCharge,units,age,diagnosis_count,modifier_count,position,using_rcm,cpt_code,sex,itemType,modifier_code_1,modifier_code_2,modifier_code_3,modifier_code_4,diagnosis_code_1,diagnosis_code_2,diagnosis_code_3,diagnosis_code_4,payer_code,pipt,pipr,psc]],
                        columns=_COLUMNS)
    predictions = predict_model(model, data=data)
    return {'prediction': int(predictions['Label'][0])}

if __name__ == '__main__':
    uvicorn.run(app, host='127.0.0.1', port=8000)
# Execution Role : FastApiLambdaRole
# RunTime : Python 3.8
# Layer1 (ARN) : arn:aws:lambda:us-west-2:446751924810:layer:python-3-8-scikit-learn-0-23-1:4
# Layer2 (Custom) : pythonpackage(FastAPI libs)
import json
import joblib
import re
import string
from bs4 import BeautifulSoup
import numpy as np
from fastapi import FastAPI
from pydantic import BaseModel
from mangum import Mangum
import tempfile
import boto3
# AWS / app wiring for the Lambda deployment.
s3_client = boto3.client("s3")
# Declaring our FastAPI instance
app = FastAPI()
# Mangum adapts the ASGI app so AWS Lambda can invoke it as a handler.
lambda_handler = Mangum(app)
# Defining path operation for root endpoint
@app.get("/")
def main():
    """Health-check / landing endpoint."""
    return {"message": "Welcome to AI!"}
class request_body(BaseModel):
    """Request schema for /SimulatedClaimData: one claim, all fields as text.

    BUG FIX: the original annotated every field with ``string`` -- the
    stdlib *module* imported above, not a type -- and the ``position`` line
    was garbled into invalid syntax (``position: string: stringusing_rcm``).
    All fields are plain ``str``.
    """
    unitCharge: str
    units: str
    age: str
    diagnosis_count: str
    modifier_count: str
    position: str
    using_rcm: str
    cpt_code: str
    sex: str
    itemType: str
    modifier_code_1: str
    modifier_code_2: str
    modifier_code_3: str
    modifier_code_4: str
    diagnosis_code_1: str
    diagnosis_code_2: str
    diagnosis_code_3: str
    diagnosis_code_4: str
    payer_code: str
    pipt: str
    pipr: str
    psc: str
@app.post("/SimulatedClaimData")
def ClaimData(data: request_body):
    """Score a simulated claim with the model stored in S3.

    BUG FIXES vs the original:
    * ``predict_model`` and ``data_unseen`` were undefined in this file
      (they exist only in the pycaret notebook); the request body is now
      framed and scored with the loaded model directly.
    * ``prediction.[0]`` / ``return {prediction.[0]}`` were syntax errors;
      the endpoint returns a JSON-serializable dict instead.
    """
    import pandas as pd  # not in this file's import block; local to keep Lambda layers unchanged

    # Read the trained model from S3 into a temp file, then unpickle it.
    with tempfile.TemporaryFile() as fp:
        s3_client.download_fileobj(
            Fileobj=fp,
            Bucket="###-ai-s3-dev",
            Key="./Final catboost Model 08Feb2023.pkl",
        )
        fp.seek(0)
        model = joblib.load(fp)

    # One-row frame from the validated request body.
    # NOTE(review): assumes the pydantic field order matches the column
    # order the pipeline was trained on, and that the unpickled object
    # exposes a sklearn-style .predict -- confirm against the saved model.
    frame = pd.DataFrame([data.dict()])
    prediction = model.predict(frame)
    return {"prediction": int(prediction[0])}